#X was added to all output CSVs to protect our original data
#trump full corpus
#`tweets_01.08.2021` is not created in this script; it must already exist in the
#workspace. The code below reads its `text` column.
corpus<-tweets_01.08.2021

#call library and make initial visual
library(ggplot2)

#machine learning 
library(mallet)
#data-frame verbs used throughout (select, filter, count, bind_rows, ...)
library(dplyr)
#some useful text tools
library(tidytext)

#frequency analysis: break the `text` column into word tokens (one row per
#token, stored in a column called `word`), then count each distinct token
freq <- count(unnest_tokens(corpus, word, text), word)


#show the stop words that are currently in the list
stop_words$word
#append the extra rows held in `stops` (not defined in this script) and inspect
#the combined table
stop_words <- stop_words %>% bind_rows(stops)
View(stop_words)

#the stop list is handed to mallet.import() as a file path, so write one word
#per line to a temporary file
tmp <- tempfile()
writeLines(text = stop_words$word, con = tmp)

#NOTE(review): the tweet text is passed as both the document id and the
#document body - confirm that an id column was not intended for the first one
docs <- mallet.import(id.array = corpus$text, text.array = corpus$text, tmp)

#my zipcode for seed
#set.seed() only seeds R's own generator (used further down by rmarkovchain and
#geom_jitter); MALLET samples on the Java side and needs its own seed
set.seed(97330)

#set topics
topic_model <- MalletLDA(num.topics = 50)
#seed the Java sampler BEFORE the documents are loaded, so that reruns of this
#script produce the same topics
topic_model$setRandomSeed(97330L)
#the model then loads the docs we specified above
topic_model$loadDocuments(docs)
#train the model for 2500 sampling iterations
topic_model$train(2500)

#perplexity

#output relevant documents
#doc.topics: one row per document, one column per topic
doc.topics <- mallet.doc.topics(topic_model, normalized = TRUE, smoothed = TRUE)
#topic.words: one row per topic, one column per vocabulary word
topic.words <- mallet.topic.words(topic_model, normalized = TRUE, smoothed = TRUE)

#and this is my personal absolute favorite - this is a dendrogram (like a family tree) of how the topics relate to each other
#(a stray `to` token in front of the comment above used to stop the script with
#"object 'to' not found")
plot(mallet.topic.hclust(doc.topics, topic.words, balance = .5))

#these are the topic labels: the 10 highest-weighted words of every topic,
#which makes the content of each topic readable
result <- mallet.topic.labels(topic_model, topic.words, num.top.words = 10)
result2 <- data.frame(result)
#data.frame() names the unnamed topic columns X1, X2, ... (one per topic)
doc.topics2 <- data.frame(doc.topics)
#attach the per-document topic shares to the tweets. topic.words is left out on
#purpose: it has one row per topic rather than one per tweet, so bind_cols()
#cannot line it up with corpus and stops with a row-count error
wide_corpus <- bind_cols(corpus, doc.topics2)
#write.csv(wide_corpus, "wide_corpusZ.csv", row.names = FALSE)
#write.csv(result2, "result2.csv", row.names = FALSE)
#View(wide_corpus)

#reload
#NOTE(review): wide_corpusZ is not created in this script - presumably the CSV
#written above, read back into the workspace; confirm
wide_corpus <- wide_corpusZ
#Big ol Chunk for adding easily manipulated dates and topic numbers
library(tidyr)
library(dplyr)
#keep the tweet id and the 50 topic columns X1..X50
just_topics <- select(wide_corpus, id, all_of(paste0("X", 1:50)))
#long format: one row per (id, topic column) pair
jt2 <- pivot_longer(just_topics, cols = -id)

library(lubridate)
#parse the timestamps with mdy_hm(), the format every later step applies to
#this same `date` column; ymd_hms() cannot parse strings that mdy_hm() accepts
#NOTE(review): confirm the date format of the reloaded file
times <- mdy_hm(wide_corpus$date)
#name the new column explicitly instead of relying on an auto-generated name
big <- bind_cols(wide_corpus, times = times)

#id, raw date string, tweet text and the 50 topic columns X1..X50
big2 <- select(big, id, date, text, all_of(paste0("X", 1:50)))
#long format: one row per (tweet, topic column) pair
d3 <- pivot_longer(big2, cols = -c(id, date, text))
#keep only the pairs where the topic's value for the tweet is above 0.5
d4 <- filter(d3, value > .5)
View(d4)

#code to pull this together
#Split d4 into consecutive, non-overlapping periods.
# * the date string is parsed once instead of inside every filter
# * rows are ordered by the parsed timestamp; sorting the raw "m/d/yyyy" strings
#   is alphabetical, not chronological, and net() below builds its transitions
#   from consecutive rows (rows sharing a timestamp keep their d4 order)
# * every period is the half-open range [from, to), so no day is lost between
#   two periods and none is counted twice
d4_dated <- d4 %>%
  mutate(parsed_stamp = mdy_hm(date), parsed_day = as_date(parsed_stamp)) %>%
  arrange(parsed_stamp)

#period boundaries: each is the first calendar day of the period it opens
start_2013        <- as.Date("2013-01-01")
#NOTE(review): 16 July 2015 is the cutoff the original filters used; the
#campaign announcement is usually dated 16 June 2015 - confirm which is meant
declaration_day   <- as.Date("2015-07-16")
election_2016_day <- as.Date("2016-11-08")
after_midterm_day <- as.Date("2018-11-07")
pandemic_start    <- as.Date("2020-01-01")
#NOTE(review): taken from the 6/7 November cutoffs in the original filters
after_2020_day    <- as.Date("2020-11-07")

#rows: d4_dated; from / to: Date bounds, NULL means unbounded on that side
#returns the rows with from <= day < to, with the helper columns removed so the
#result has exactly the columns of d4; rows whose date failed to parse are dropped
period_between <- function(rows, from = NULL, to = NULL) {
  keep <- !is.na(rows$parsed_day)
  if (!is.null(from)) {
    keep <- keep & rows$parsed_day >= from
  }
  if (!is.null(to)) {
    keep <- keep & rows$parsed_day < to
  }
  select(rows[keep, ], -parsed_stamp, -parsed_day)
}

#everything before 2013
beforethirteen <- period_between(d4_dated, to = start_2013)

#all tweets from 2013-declaration to run
thirteen_to_declaration <- period_between(d4_dated, start_2013, declaration_day)

#declaration to election day
#(16 July 2015 and 7 November 2016 used to fall into no period at all)
declartion_to_election <- period_between(d4_dated, declaration_day, election_2016_day)

#begin to midterms
election_to_midterm <- period_between(d4_dated, election_2016_day, after_midterm_day)

#until the pandemic
midterm_to_pandemic <- period_between(d4_dated, after_midterm_day, pandemic_start)

#pandemic to the election
#(the November 2020 comparisons were inverted: late November landed here and
#early November in end_stage, while 6 and 7 November were dropped)
pandemic_to_election <- period_between(d4_dated, pandemic_start, after_2020_day)

#all dates after the election
end_stage <- period_between(d4_dated, from = after_2020_day)


#code to produce data frames for gephi
#net() turns an ordered column of labels into a SOURCE/TARGET edge list in
#which row i links label i-1 to label i. A sentinel node is placed before the
#first label and after the last one, so the list starts and ends at the sentinel.
#  X        data frame to ingest; its `name` column is read in row order
#  sentinel label of the artificial start/end node (default "X51")
#returns (invisibly) a data frame with character columns SOURCE and TARGET and
#  one more row than X
#side effect: the same data frame is stored in the global variable `Result`,
#  which the code below reads
net <- function(X, sentinel = "X51") {
  labels <- X[["name"]]
  if (is.null(labels)) {
    stop("net() needs a data frame with a `name` column", call. = FALSE)
  }
  labels <- as.character(labels)
  edges <- data.frame(
    SOURCE = c(sentinel, labels),
    TARGET = c(labels, sentinel),
    stringsAsFactors = FALSE
  )
  #kept for backward compatibility: existing callers pick the result up from
  #the global `Result` rather than from the return value
  Result <<- edges
  invisible(edges)
}
#build the edge list for the declaration-to-election period; net() leaves it
#in the global variable `Result`
net(X = declartion_to_election)

#stavroz code was used to create the periodized CSV files
#write.csv(Result, "stavroz2.csv", row.names = FALSE)

library(network)
#network object built from the edge list, keeping self-loops and repeated edges
network(x = Result, loops = TRUE, multiple = TRUE)


#markov process analysis code
library(markovchain)
library(network)
#Result is produced by function net
R1 <- Result
library(igraph)
#edgelist into matrix
R2 <- as.matrix(R1)
#graph from matrix
R3 <- graph_from_edgelist(R2)
#into a proper adj matrix: cell [i, j] counts the edges from vertex i to vertex j
R4 <- as_adjacency_matrix(R3)
#turns the dgCmatrix back to a normal matrix
R5 <- as.matrix(R4)
#divide every row by its total so each row becomes a set of transition
#probabilities that sums to 1
lisa <- R5
smithers <- rowSums(R5)
mrburns <- lisa / smithers
View(mrburns)

#create network
edge_network <- network(R1, directed = TRUE, loops = TRUE, multiple = TRUE)
#take the state labels from the transition matrix itself, so they are in the
#same order as its rows and columns by construction; the vertex order of the
#separately built `edge_network` is not guaranteed to match the igraph order
states <- rownames(mrburns)
#spec rows
byRow <- TRUE

#load markov chain
scrubsMC <- new("markovchain", states = states,
                transitionMatrix = mrburns, byrow = byRow)

#code to generate chains: 10 simulated steps starting from state "X6"
splorp <- rmarkovchain(n = 10, object = scrubsMC, t0 = "X6")
splorp

library(ggplot2)

#code to make other graphics
#favorites against the unparsed `date` column, coloured by isRetweet, with
#jitter to spread overlapping points
ggplot(wide_corpus, aes(x = date, y = favorites, colour = isRetweet)) +
  geom_jitter()

#HOW TO MAKE THE FIGURE
#parse the date strings into proper date-times
wcX <- wide_corpus %>%
  mutate(easy_date = mdy_hm(date))

#number of rows per calendar day
OP <- wcX %>%
  group_by(date(easy_date)) %>%
  count()
View(OP)
#ylim() added after scale_y_log10() replaces the log scale with a linear one,
#so the limits are set on the log scale itself
#NOTE(review): confirm a log axis is what the figure should show
OP %>%
  ggplot(aes(`date(easy_date)`, n)) +
  geom_jitter() +
  scale_y_log10(limits = c(1, 160)) +
  labs(x="Date", y="Tweets Assigned to Category", title="Twitter Use Volume Detected Per Day")






library(dplyr)
library(ggplot2)
library(lubridate)
G <- wide_corpus$date
library(stringr)
#split each timestamp at the first space: column 1 is the date part, column 2
#whatever follows it
L <- str_split(G, " ", n = 2, simplify = TRUE)
View(L)
LLbean <- data.frame(L)
#number of rows per date string
Hollister <- LLbean %>%
  count(X1)

#parse the date strings so the counts can go on a time axis.
#NOTE(review): this used to be assigned to `election` and computed from
#election_to_midterm, which has no X1 column once the topics are pivoted long,
#while the plot below read an undefined `Abercrombie`. Rebuilt from Hollister
#(the table that has both X1 and n) and parsed month-first like the rest of the
#script - confirm this is the intended figure
Abercrombie <- Hollister %>%
  mutate(processdate = mdy(X1))

Abercrombie %>%
  filter(!is.na(processdate)) %>%
  ggplot(aes(processdate, n)) +
  geom_jitter()



#spread of years in the post-election period; parsed with mdy_hm() because
#end_stage only holds rows whose date was readable in that format, which
#ymd_hms() would turn into NA
summary(year(mdy_hm(end_stage$date)))
  

